{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将 composer 字段 进行多值离散型数据统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle as pk\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import math\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "import re\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '../Data/'  # 文件路径\n",
    "model_path = '../model/' # 模型路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(model_path+'data_all_train.pkl', 'rb') as fr:\n",
    "    data_all_train = pk.load(fr)\n",
    "fr.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>206471.0</td>\n",
       "      <td>359</td>\n",
       "      <td>Bastille</td>\n",
       "      <td>Dan Smith| Mark Crew</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Good Grief</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>284584.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Various Artists</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Lords of Cardboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>225396.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Nas</td>\n",
       "      <td>N. Jones、W. Adams、J. Lordan、D. Ingle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Hip Hop Is Dead(Album Version (Edited))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>255512.0</td>\n",
       "      <td>1019</td>\n",
       "      <td>Soundway</td>\n",
       "      <td>Kwadwo Donkoh</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>Disco Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>187802.0</td>\n",
       "      <td>1011</td>\n",
       "      <td>Brett Young</td>\n",
       "      <td>Brett Young| Kelly Archer| Justin Ebach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Sleep Without You</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd  gender  \\\n",
       "201969               Explore  online-playlist       1     1   0     NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13  24  female   \n",
       "183559   Local playlist more   local-playlist       1    13  24  female   \n",
       "149511   Local playlist more   local-playlist       1    13  24  female   \n",
       "74867                Explore  online-playlist       1     1   0     NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005     206471.0   \n",
       "1932462               9                20110525         20170911     284584.0   \n",
       "183559                9                20110525         20170911     225396.0   \n",
       "149511                9                20110525         20170911     255512.0   \n",
       "74867                 7                20120102         20171005     187802.0   \n",
       "\n",
       "        genre_ids      artist_name                                 composer  \\\n",
       "201969        359         Bastille                     Dan Smith| Mark Crew   \n",
       "1932462      1259  Various Artists                                      NaN   \n",
       "183559       1259              Nas     N. Jones、W. Adams、J. Lordan、D. Ingle   \n",
       "149511       1019         Soundway                            Kwadwo Donkoh   \n",
       "74867        1011      Brett Young  Brett Young| Kelly Archer| Justin Ebach   \n",
       "\n",
       "        lyricist  language                                     name  \n",
       "201969       NaN      52.0                               Good Grief  \n",
       "1932462      NaN      52.0                       Lords of Cardboard  \n",
       "183559       NaN      52.0  Hip Hop Is Dead(Album Version (Edited))  \n",
       "149511       NaN      -1.0                             Disco Africa  \n",
       "74867        NaN      52.0                        Sleep Without You  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def composer_2_set(item_name, item_set):\n",
    "    pattern = r'[||/]'\n",
    "    item_name = str(item_name)\n",
    "    item_arr = re.split(pattern, item_name)\n",
    "    for item in item_arr:\n",
    "        item = item.strip()\n",
    "        item_set.add(item)\n",
    "    return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "composer_list_len: 82599\n"
     ]
    }
   ],
   "source": [
    "composer_set = set()\n",
    "data_all_train.composer.apply(lambda composer : composer_2_set(composer, composer_set))\n",
    "composer_list = list(composer_set)\n",
    "composer_list_len = len(composer_list)\n",
    "print('composer_list_len: %d' % composer_list_len)\n",
    "composer_index = dict()\n",
    "for i,composer in enumerate(composer_list):\n",
    "    composer_index[composer] = i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_item_matrix(data_all_train, item_index_dict, item_len):\n",
    "    item_matrix = ss.dok_matrix((data_all_train.shape[0], item_len), dtype='uint8')\n",
    "    # item_matrix = np.zeros([data_all_train.shape[0], item_len], dtype = 'int8')\n",
    "    for df_index in range(data_all_train.shape[0]):\n",
    "        line = data_all_train.iloc[df_index, :]\n",
    "        pattern = r'[||/]'\n",
    "        item_arr = re.split(pattern, str(line.composer))\n",
    "        for item in item_arr:\n",
    "             item = item.strip()\n",
    "             item_index = item_index_dict[item]\n",
    "             item_matrix[df_index,item_index] = 1\n",
    "    return item_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7377418"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR:root:Internal Python error in the inspect module.\n",
      "Below is the traceback from this internal error.\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Traceback (most recent call last):\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py\", line 3325, in run_code\n",
      "    exec(code_obj, self.user_global_ns, self.user_ns)\n",
      "  File \"<ipython-input-36-973fc687ea17>\", line 1, in <module>\n",
      "    composer_matrix = get_item_matrix(data_all_train,composer_index,composer_list_len)\n",
      "  File \"<ipython-input-35-4ed0a039587a>\", line 5, in get_item_matrix\n",
      "    line = data_all_train.iloc[df_index, :]\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\", line 1494, in __getitem__\n",
      "    return self._getitem_tuple(key)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\", line 2145, in _getitem_tuple\n",
      "    return self._getitem_lowerdim(tup)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\", line 988, in _getitem_lowerdim\n",
      "    section = self._getitem_axis(key, axis=i)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\", line 2232, in _getitem_axis\n",
      "    return self._get_loc(key, axis=axis)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py\", line 146, in _get_loc\n",
      "    return self.obj._ixs(key, axis=axis)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\", line 2861, in _ixs\n",
      "    dtype=new_values.dtype)\n",
      "KeyboardInterrupt\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py\", line 2039, in showtraceback\n",
      "    stb = value._render_traceback_()\n",
      "AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py\", line 1101, in get_records\n",
      "    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py\", line 319, in wrapped\n",
      "    return f(*args, **kwargs)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py\", line 353, in _fixed_getinnerframes\n",
      "    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/inspect.py\", line 1502, in getinnerframes\n",
      "    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/inspect.py\", line 1460, in getframeinfo\n",
      "    filename = getsourcefile(frame) or getfile(frame)\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/inspect.py\", line 696, in getsourcefile\n",
      "    if getattr(getmodule(object, filename), '__loader__', None) is not None:\n",
      "  File \"/letv/hubo/anaconda3/lib/python3.7/inspect.py\", line 733, in getmodule\n",
      "    if ismodule(module) and hasattr(module, '__file__'):\n",
      "KeyboardInterrupt\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m"
     ]
    }
   ],
   "source": [
    "composer_matrix = get_item_matrix(data_all_train,composer_index,composer_list_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    # composer_matrix\n",
    "    sio.mmwrite(model_path+'composer_matrix.mtx', composer_matrix)\n",
    "except Exception as e:\n",
    "    print('mmwrite composer_matrix.mtx error:',e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 生成 artist_name 在 dataframe 中的列名\n",
    "composer_columns = list()\n",
    "for i in range(composer_list_len):\n",
    "    composer_columns.append('composer_'+str(i))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-20-ecac2b1ba031>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomposer_matrix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtodense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36mtodense\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    846\u001b[0m             \u001b[0;31m`\u001b[0m\u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatrix\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mobject\u001b[0m \u001b[0mthat\u001b[0m \u001b[0mshares\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    847\u001b[0m         \"\"\"\n\u001b[0;32m--> 848\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0masmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    849\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    850\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    878\u001b[0m             \u001b[0mappropriate\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    879\u001b[0m         \"\"\"\n\u001b[0;32m--> 880\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    881\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    882\u001b[0m     \u001b[0;31m# Any sparse matrix format deriving from spmatrix must define one of\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/coo.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    321\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    322\u001b[0m         \u001b[0;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m         \u001b[0mB\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_toarray_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    324\u001b[0m         \u001b[0mfortran\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf_contiguous\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    325\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfortran\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_contiguous\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36m_process_toarray_args\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m   1184\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1185\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1186\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "print(type(composer_matrix.todense()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-15-33ce344112ad>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomposer_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomposer_matrix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtodense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomposer_columns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_all_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'uint8'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'composer_df.shape %d, %d'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mcomposer_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mcomposer_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36mtodense\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    846\u001b[0m             \u001b[0;31m`\u001b[0m\u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatrix\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mobject\u001b[0m \u001b[0mthat\u001b[0m \u001b[0mshares\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0mmemory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    847\u001b[0m         \"\"\"\n\u001b[0;32m--> 848\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0masmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    849\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    850\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    878\u001b[0m             \u001b[0mappropriate\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    879\u001b[0m         \"\"\"\n\u001b[0;32m--> 880\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    881\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    882\u001b[0m     \u001b[0;31m# Any sparse matrix format deriving from spmatrix must define one of\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/coo.py\u001b[0m in \u001b[0;36mtoarray\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m    321\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    322\u001b[0m         \u001b[0;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m         \u001b[0mB\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_toarray_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    324\u001b[0m         \u001b[0mfortran\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf_contiguous\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    325\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfortran\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_contiguous\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/letv/hubo/anaconda3/lib/python3.7/site-packages/scipy/sparse/base.py\u001b[0m in \u001b[0;36m_process_toarray_args\u001b[0;34m(self, order, out)\u001b[0m\n\u001b[1;32m   1184\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1185\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1186\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "composer_df = pd.DataFrame(composer_matrix.todense(), columns = composer_columns, index = data_all_train.index, dtype='uint8')  \n",
    "print('composer_df.shape %d, %d' % composer_df.shape)\n",
    "composer_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "convert to composer_df error, \n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    composer_df = pd.DataFrame(composer_matrix.todense(), columns = composer_columns, index = data_all_train.index, dtype='uint8')  \n",
    "    print('composer_df.shape %d, %d' % composer_df.shape)\n",
    "    composer_df.head()\n",
    "except Exception as e:\n",
    "    print('convert to composer_df error,',e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# composer_df 计算该 dataframe 占用内存大小\n",
    "composer_df_mem = composer_df.memory_usage().sum()/(1024**2)\n",
    "print('composer_df_mem memory usage: %f Mb' % composer_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 序列化到磁盘中\n",
    "try:\n",
    "    with open(model_path + 'composer_df.pkl','wb') as fw:\n",
    "        pk.dump(composer_df,fw)\n",
    "    fw.close()\n",
    "except Exception as e:\n",
    "    print('save composer_df.pkl error,',e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
